---
title: "Coronavirus Clinical Trials R library"
output: html_notebook
---
### Description

This R project contains scripts to format the database of clinical trials on COVID-19 into Gargantext-readable files (see http://gargantext.org).

The database should be in TSV format (separator = tab; no delimiters) and be encoded in UTF-8.

### Loading the data
First, define the name of the file to be processed. This file should be in the `data/` folder.
```{r}
# Load the date helpers, the project library and the raw trial table.
library(lubridate)
source("coronalib.R") # project R library

# Base name (without extension) of the file to load from data/.
# Earlier database snapshots are kept here for reference:
# name <- "Database_covid_studies_registered"
# name <- "Database210303vaccine_cleaned"
# name <- "Database210609Vaccins"
name <- "Database211026"

# The file carries a .csv extension but is read as tab-separated text.
AllData <- read.csv(
  file.path("data", paste0(name, ".csv")),
  header = TRUE,
  sep = "\t"
)

# Drop the inclusion / exclusion criteria columns.
AllData <- AllData[
  ,
  setdiff(names(AllData), c("Inclusion.criteria", "Exclusion.criteria")),
  drop = FALSE
]
# AllData <- filter(AllData, !is.na(AllData$Registration.date))

# Quick look at what was loaded: row count, distinct registration numbers,
# first rows.
nrow(AllData)
x <- unique(AllData$Trial.registration.number)
# head(x)[[1]]
head(AllData)

```
### Data segmentation
Several data frames are generated, depending on which clinical trials (CTs) are under study.

```{r}
library(dplyr)
library(stringr)

# Earlier selection, based on the study aim (disabled):
# Prevention <- filter(AllData, grepl("Prevention", AllData$Study.aim)) # CTs tagged prevention

# Keep the arms whose treatment category mentions "vaccine" (case-insensitive
# through tolower), then remove two registrations that are excluded by hand.
# Rows with a missing registration number are dropped as well, exactly as the
# `!=` comparisons inside subset() did.
Prevention <- AllData %>%
  filter(grepl("vaccine", tolower(Treatment.category))) %>% # CTs tagged vaccines
  subset(
    !is.na(Trial.registration.number) &
      !(Trial.registration.number %in% c("NCT04833101", "NCT04892459"))
  )

# NOTE(review): Treatments and Posttreatment are only assigned in the disabled
# lines below, although a later chunk still passes them to export helpers.
# Treatments <- filter(AllData, grepl("Treatment", AllData$Study.aim)) # CTs tagged Treatment
# Posttreatment <- filter(AllData, grepl("Post treatment", AllData$Study.aim)) # CTs tagged Post-treatment
# print(paste(count(Prevention), " Prevention arms,", count(Treatments), " Treatments arms and ", count(Posttreatment), " Post-treatment arms."))
```
## Export of data and viz
Data are exported in several formats. The list of all treatments is also exported, assuming that treatments are separated by a '+' sign in the treatment column of the original database.

## Quentin 

```{r}
## Write the output vaccine file

## Timestamp
# Parse the registration date once, then split it into day / month / year.
# Each part is stored as a character column of the export table `x`.
x <- local({
  registered <- as.Date(Prevention$Registration.date, "%Y-%m-%d")
  data.frame(
    publication_day = as.character(day(registered)),
    publication_month = as.character(month(registered)),
    publication_year = as.character(year(registered)),
    stringsAsFactors = FALSE
  )
})

## Authors, title and source
# The trial registration number is used as the title.
x$authors <- Prevention$First.author
x$title <- Prevention$Trial.registration.number

## Phase

# x$source = Prevention$Phase
# x$source = sub("^$", "Not applicable", x$source)
# x$source  = str_replace_all(x$source,"Not applicable ","Not applicable")
# x$source  = str_replace_all(x$source,"Phase 1 ","Phase 1")
# x$source  = str_replace_all(x$source,"/",";")

#source <- unique(x[c("source")])

## Publications
# Reduce the publication date to a flag stored in `source`:
# an empty value becomes "No", "No " loses its trailing space, and any value
# starting with a digit becomes "Yes".
x$source <- Prevention$Publication.date %>%
  sub("^$", "No", .) %>%
  str_replace_all("No ", "No") %>%
  str_replace_all("^[0-9].*", "Yes")

## Countries
# 
# x$source  = Prevention$Countries
# x$source  = str_replace_all(x$source,",",";")

## Links
# x$source = Prevention$Full.text.link

## Fundings
 
# x$institutes  = str_replace_all(Prevention$Funding,";",",")
# x$institutes  = str_replace_all(x$institutes,"\n"," ")
# x$institutes  = str_replace_all(x$institutes,"Assistance Publique - Hôpitaux de Paris","Assistance Publique Hopitaux de Paris")
# x$institutes  = str_replace_all(x$institutes,"ASTRAZENECA AB","AstraZeneca")
# x$institutes  = str_replace_all(x$institutes,"Bharat Biotech International Limited","Bharat Biotech International Ltd")
# x$institutes  = str_replace_all(x$institutes,"West China Hospital; Sichuan University","West China Hospital Sichuan University")
# x$institutes  = str_replace_all(x$institutes,"West China Hospital, Sichuan University","West China Hospital Sichuan University")
# x$institutes  = str_replace_all(x$institutes,"Valneva \\(Austria\\)","Valneva Austria GmbH")
# x$institutes  = str_replace_all(x$institutes,"Universidade Federal do Rio de Janeiro - RJ, Brazi","Universidade Federal do Rio de Janeiro")
# x$institutes  = str_replace_all(x$institutes,"UMC Utrecht","University Medical Centre Utrecht")
# x$institutes  = str_replace_all(x$institutes,"Shulan \\(Hangzhou\\) Hospital, Center for Disease Control and Prevention of Guangxi Zhuang Autonomous Region","Shulan (Hangzhou) Hospital")
# x$institutes  = str_replace_all(x$institutes,"Sinovac Research and Development Co., Ltd.","Sinovac Biotech Co., Ltd")
# x$institutes  = str_replace_all(x$institutes,"Serum Institute of India Private Limited","Serum Institute of India Pvt Ltd")
# x$institutes  = str_replace_all(x$institutes,"Sanofi Pasteur, a Sanofi Company","Sanofi Pasteur")
# x$institutes  = str_replace_all(x$institutes,"REITHERA SRL","ReiThera Srl")
# x$institutes  = str_replace_all(x$institutes,"Radboudumc","Radboud University")
# x$institutes  = str_replace_all(x$institutes,"Novavax, Inc.","Novavax")
# x$institutes  = str_replace_all(x$institutes,"Jiangsu Province Centers for Disease Control and Prevention","Jiangsu Provincial Center for Disease Control and Prevention")
# x$institutes  = str_replace_all(x$institutes,"Jiangsu Provincial Center For Disease Control and Prevention \\(Public Health Research Institute of Jiangsu Province\\)","Jiangsu Provincial Center for Disease Control and Prevention")
# x$institutes  = str_replace_all(x$institutes,"Jiangsu Provincial Center for Disease Prevention and Control","Jiangsu Provincial Center for Disease Control and Prevention")
# x$institutes  = str_replace_all(x$institutes,"Insitute of Biotechnology, Academy of Military Medical Sciences, PLA of China","Institute of Biotechnology, Academy of Military Medical Sciences, PLA of China")
# x$institutes  = str_replace_all(x$institutes,"Biontech SE","BioNTech SE")
# x$institutes  = str_replace_all(x$institutes,"BioNTech RNA Pharmaceuticals GmbH","BioNTech SE")
# x$institutes  = str_replace_all(x$institutes,"Cadila Pharmaceuticals Limited","Cadila Pharnmaceuticals")
# x$institutes  = str_replace_all(x$institutes,"China National Biotec Group Co.Ltd","China National Biotec Group Company Limited")
# x$institutes  = str_replace_all(x$institutes,"CureVac AG,","CureVac AG")
# #x$source  = str_replace_all(x$source,"Janssen Pharmaceutical K.K.","Janssen Vaccines & Prevention B.V.")
# x$institutes  = str_replace_all(x$institutes,"1. Medical Research Council \\(UK\\)","Medical Research Council (UK)")
# x$institutes  = str_replace_all(x$institutes,"Cadila Healthcare Limited","Cadila Healthcare Ltd")
# x$institutes  = gsub("^Anhui.*", "Anhui Zhifei Longcom Biologic Pharmacy Co.  Ltd.", x$institutes)
# x$institutes  = gsub(" Universidade Federal de Santa Catarina", "Universidade Federal de Santa Catarina", x$institutes)
# x$institutes  = gsub("^Center for Genetic Engineering and Biotechnology.*", "Center for Genetic Engineering and Biotechnology (CIGB)", x$institutes)
# x$institutes  = gsub("Erasmus MC", "Erasmus University Medical Center", x$institutes)
# x$institutes  = gsub("^Finlay Vaccine Institute.*", "Finlay Vaccine Institute (FVI)", x$institutes)
# x$institutes  = gsub("Janssen Research & Development  LLC","Janssen Vaccines & Prevention B.V.",x$institutes)
# x$institutes  = gsub("Karolinska Universitetssjukhuset","Karolinska University Hospital",x$institutes)
# x$institutes  = gsub("^Medigen.*","Medigen Vaccine Biologics Corp.",x$institutes)
# x$institutes  = gsub("^Serum.*","Serum Institute of India Pvt Ltd",x$institutes)
# x$institutes  = gsub("^shulan.*","Shulan (Hangzhou) Hospital",x$institutes)
# x$institutes  = gsub("SOUTH AFRICAN MEDICAL RESEARCH COUNCIL  SAMRC","South African Medical 7505 Council",x$institutes)
# x$institutes  = gsub("^Takis.*","Takis",x$institutes)
# x$institutes  = gsub("UniversitÃ¤tsklinikum Hamburg-Eppendorf","Universitätsklinikum Hamburg-Eppendorf",x$institutes)
# x$institutes  = gsub("Cadila Pharnmaceuticals","Cadila Healthcare Ltd",x$institutes)
# x$institutes  = gsub("^Janssen.*","Janssen Vaccines & Prevention B.V.",x$institutes)
# x$institutes  = gsub("Adagio Therapeutics  Inc.","Adagio Therapeutics Inc",x$institutes)
# x$institutes  = gsub("^Barcelona Institute.*","Barcelona Institute for Global Health",x$institutes)
# x$institutes  = gsub("^Cambridge university Hosptials NHS.*","Cambridge University Hospitals NHS Foundation Trust",x$institutes)
# x$institutes  = gsub("^CinnaGen.*","Cinnagen",x$institutes)
# x$institutes  = gsub("^Medical University of Vienna.*","Medical University of Vienna",x$institutes)
# x$institutes  = gsub("^Merck Sharp.*","Merck Sharp & Dohme Corp.",x$institutes)
# x$institutes  = gsub("^Ministerio de Salud de Ciudad.*","Ministerio de Salud de Ciudad Autónoma de Buenos Aires",x$institutes)
# x$institutes  = gsub("^National Research Council of Thailand (NRCT).*","National Reseach Council  Thailand",x$institutes)
# x$institutes  = gsub("^Peter MacCallum.*","Peter MacCallum Cancer Centre",x$institutes)
# x$institutes  = gsub("^Pfizer.*","Pfizer",x$institutes)
# x$institutes  = gsub("^Rigshospitalet.*","Rigshospitalet",x$institutes)
# x$institutes  = gsub("^Universidade Federal do Rio.*","Universidade Federal do Rio de Janeiro",x$institutes)
# x$institutes  = gsub("^University Of Birmingham.*","University of Birmingham",x$institutes)
# x$institutes  = gsub("^West China Hospital.*","West China Hospital of Sichuan University",x$institutes)
# x$institutes  = gsub("^WestVac Biopharma.*","WestVac Biopharma",x$institutes)
# 
# x$institutes  = str_replace_all(x$institutes,","," ")
# x$institutes  = str_replace_all(x$institutes,";"," ")
# x$institutes  = gsub("\"","",x$institutes)
# x$source = x$institutes

# source <- unique(x[c("source")])

## Abstract
# For trials registered before 2021, strip the VOC / VB / HETI tags from the
# treatment category. This used to be a row-by-row loop whose
# `if (tmp_year < 2021)` stopped with "missing value where TRUE/FALSE needed"
# as soon as one registration date was missing or unparseable. The mask below
# is NA-safe: rows without a usable year keep their category unchanged.
Prevention$Treatment.category <- local({
  category <- as.character(Prevention$Treatment.category)
  registered_year <- year(as.Date(Prevention$Registration.date, "%Y-%m-%d"))
  before_2021 <- !is.na(registered_year) & registered_year < 2021
  # Tags are removed one after the other, in the same order as before.
  for (tag in c("VOC", "VB", "HETI")) {
    category[before_2021] <- gsub(tag, "", category[before_2021])
  }
  category
})

# The abstract is "<category> . <types> . <names>", lower-cased, with the '+'
# separators of the type and name columns turned into " ; ".
x$abstract <- paste(
  tolower(Prevention$Treatment.category),
  " . ",
  str_replace_all(tolower(Prevention$Treatment.type), "[[+]]", " ; "),
  " . ",
  str_replace_all(tolower(Prevention$Treatment.name), "[[+]]", " ; ")
)
# Remove characters that would break the tab-separated export or the parsing:
# line breaks and slashes become spaces, double quotes are dropped.
x$abstract <- str_replace_all(x$abstract, "\n", " ")
x$abstract <- gsub("\"", "", x$abstract)
x$abstract <- gsub("/", " ", x$abstract)

## Weight
# The total sample size is used as the weight of each row.
# x$weight <- Prevention$n.randomized.in.this.arm
x$weight <- Prevention$Total.sample.size

# Keep only rows with a usable weight: missing values and the listed
# non-numeric entries are discarded.
x <- subset(
  x,
  !is.na(weight) &
    !(weight %in% c(
      "NA", "Not reported", "Not reported ", "University Medical Center"
    ))
)

## Output
# NOTE(review): the file name says "fundings" while the active `source` column
# of this chunk holds the publication flag -- confirm the intended name.
write.table(
  x,
  file = paste0("output/corpus-26-10-2021-fundings", ".csv"),
  sep = "\t",
  row.names = FALSE
)
```

```{r}
source("coronalib.R")
library(reshape)
library(wordcloud)
library(ggplot2)

# `Treatments` and `Posttreatment` are only assigned in disabled lines of the
# data-segmentation chunk, so using them unconditionally stops this chunk with
# "object not found" on its very first call. The calls that need them are run
# only when a data frame of that name is available; otherwise they are skipped
# with a message and the remaining exports still run.
has_treatments <- is.data.frame(get0("Treatments"))
has_posttreatment <- is.data.frame(get0("Posttreatment"))
if (!has_treatments) {
  message("`Treatments` is not defined: skipping the Treatment-only exports.")
}
if (!has_posttreatment) {
  message("`Posttreatment` is not defined: skipping its tag cloud.")
}

## Exporting for phylo
if (has_treatments) {
  garg_export_for_phylo(Treatments, "AllData")
}
garg_export_for_phylo_types_only(AllData, "AllData")

# Html format
# export of a corpus with treatments and outcomes
if (has_treatments) {
  # exports treatments and outcomes of the clinical trials of type Treatment
  garg_export_with_html(Treatments, "Treatment")
}
garg_export_with_html(AllData, "AllData") # exports treatments and outcomes of the clinical trials of all types
garg_export_treatments_with_html(AllData, "AllData") # exports treatments of the clinical trials of all types


# export of the list of all types of treatments whatever the phase in the format Gargantext map list Gargantext V3 & V4
gargV4_export_treaments_list(AllData, "AllDb")
gargV3_export_treaments_list(AllData, "AllDb")
gargV3_export_treamentsTypes_list(AllData, "AllDb")


# Conversion of the tsv file into a Gargantext-readable tsv file
# Selection of the kind of CT to export: All / Prevention / Treatment / Post-treatment
# Selection of the kind of information to include in the main text to be processed by Gargantext (abstract column): treatments and/or outcomes


# simple txt export
if (has_treatments) {
  garg_export_all_plain(Treatments, "Treatment") # export main information in plain text
  garg_export_OnlyTreatments(Treatments, "Treatment") # export only treatments in plain text
  garg_export_OnlyOutcomes(Treatments, "Treatment") # export only outcomes in plain text
}
garg_export_all_plain(AllData, "All") # export main information in plain text

# raw export (just to have specific maps)
garg_export_raw_treatments(AllData, "AllData") ## export only info relative to treatments without any formatting.

# Some simple viz - tag cloud of the treatments per category of CT
if (has_treatments) {
  TreatmentsCloud(Treatments)
}
TreatmentsCloud(Prevention)
if (has_posttreatment) {
  TreatmentsCloud(Posttreatment)
}

```
```{r}
  # Build a table with date parts, author, title, source and abstract for every
  # row of AllData and write it, tab-separated, to output/CTphylo<tag>.csv.
  df <- AllData
  x <- local({
    # Parse the registration date once for the three date columns.
    registered <- as.Date(df$Registration.date, "%Y-%m-%d")
    data.frame(
      publication_day = day(registered),
      publication_month = month(registered),
      # NOTE(review): "%W" is the week of the year, stored under the
      # publication_year name -- presumably deliberate; confirm.
      publication_year = format(registered, "%W"),
      stringsAsFactors = FALSE
    )
  })
  x$authors <- df$First.author
  x$title <- df$Trial.registration.number
  x$source <- df$Funding
  # "<PHARMACOLOGICAL TREATMENT> . <types> . <names>", with the '+' separators
  # of the type and name columns turned into " ; ".
  x$abstract <- paste(
    toupper(df$Pharmacological.treatment),
    " . ",
    str_replace_all(tolower(df$Treatment.type), "[[+]]", " ; "),
    " . ",
    str_replace_all(tolower(df$Treatment.name), "[[+]]", " ; ")
  )
  # Rows whose registration date could not be parsed have an NA week; drop them.
  y <- filter(x, !is.na(publication_year))
  # `filename` is never assigned in this notebook, which made the export stop
  # with "object 'filename' not found". Use it when a character value of that
  # name exists, otherwise fall back to the database name loaded above.
  write.table(
    y,
    file = paste0(
      "output/CTphylo",
      get0("filename", mode = "character", ifnotfound = name),
      ".csv"
    ),
    sep = "\t",
    row.names = FALSE
  )

```



